import pandas as pd

# Number substituted into the `{0}` placeholder of every input file name.
c = 1000

# Input file-name templates, listed in the order df1, df2, df3, df4. The
# summary table built from these frames labels them 'All Narratives',
# 'Complete Narratives', 'Frequent Narratives' and 'Relevant Narratives'.
_INPUT_CSV_TEMPLATES = (
    '../data/gpo_final_data/narratives_all_with_metadata_{0}_no_frequency_filter.csv',
    '../data/gpo_final_data/narratives_complete_with_metadata_{0}_no_frequency_filter.csv',
    '../data/gpo_final_data/narratives_complete_with_metadata_{0}.csv',
    '../data/gpo_final_data/narratives_complete_with_metadata_manual_labels_rich_{0}.csv',
)

# Load the four samples in one pass; the templates are read in listed order.
df1, df2, df3, df4 = [
    pd.read_csv(template.format(c)) for template in _INPUT_CSV_TEMPLATES
]

import numpy as np
# --- Build the `narrative` string for every statement in df1 ---------------

# Where ARG1 is missing, fall back to ARG2 (clustered and raw columns alike).
# The result is assigned back to the column rather than calling
# `fillna(..., inplace=True)` on the selected column: that chained in-place
# form raises a FutureWarning in pandas 2.2 and silently leaves df1 unchanged
# once Copy-on-Write is active.
df1['ARG1'] = df1['ARG1'].fillna(df1['ARG2'])
df1['ARG1-RAW'] = df1['ARG1-RAW'].fillna(df1['ARG2-RAW'])

# Prefix the raw verb with 'not-' on rows whose negation flag is True.
# The element-wise `== True` is intentional: the flag column may hold missing
# values, which must compare as "not negated".
df1['B-V-RAW'] = np.where(
    df1['B-ARGM-NEG-RAW'] == True,  # noqa: E712
    'not-' + df1['B-V-RAW'],
    df1['B-V-RAW'],
)

# Blank out missing values only in a temporary copy of the three columns
# that are concatenated. Filling all of df1 with '' (as was done before)
# turns every later `df1.dropna(subset=[...])` into a no-op, so any count
# based on it would simply equal len(df1).
_narrative_parts = df1[['ARGO', 'B-V-RAW', 'ARG1']].fillna('')
df1['narrative'] = (
    _narrative_parts['ARGO']
    + ' ' + _narrative_parts['B-V-RAW']
    + ' ' + _narrative_parts['ARG1']
).str.strip()  # strip the separators left dangling by empty leading/trailing parts

# --- Summary table: one row per variable, one value column per sample ------

# Samples in the order of the table's value columns.
_samples = (df1, df2, df3, df4)


def _distinct(column):
    """Return a measure giving the length of `Series.unique()` for `column`."""
    def measure(frame):
        return len(frame[column].unique())
    return measure


def _present(column):
    """Return a measure counting the rows left after `dropna` on `column`."""
    def measure(frame):
        return len(frame.dropna(subset=[column]))
    return measure


def _distinct_narratives(frame):
    """Count the distinct entries of the `narrative` column via a Python set."""
    return len(set(frame['narrative']))


# (row label, measure) pairs in output order; each measure is applied to
# every sample in turn.
_row_specs = [
    ('Speeches', _distinct('gpo_id')),
    ('Sentences', _distinct('sentence_raw')),
    ('Statements', len),
    ('Narratives, unique', _distinct_narratives),
    ('Agents raw', _present('ARGO-RAW')),
    ('Agents raw, unique', _distinct('ARGO-RAW')),
    ('Agents clustered, unique', _distinct('ARGO')),
    ('Patients raw', _present('ARG1-RAW')),
    ('Patients raw, unique', _distinct('ARG1-RAW')),
    ('Patients clustered, unique', _distinct('ARG1')),
    ('Verbs raw', _present('B-V-RAW')),
    ('Verbs raw, unique', _distinct('B-V-RAW')),
    ('Verbs cleaned, unique', _distinct('B-V-CLEANED')),
]

summary_statistics = pd.DataFrame(
    [
        [label] + [measure(frame) for frame in _samples]
        for label, measure in _row_specs
    ],
    columns=['Variable', 'All Narratives', 'Complete Narratives', 'Frequent Narratives', 'Relevant Narratives'],
)

# NOTE(review): the path has no `{}` placeholder, so `.format(c)` leaves it
# unchanged and the output file name does not depend on `c` -- confirm that
# this is intended.
_output_path = '../tables/Table_D_1.tex'.format(c)
summary_statistics.to_latex(_output_path, index=False)
